/* Build *UPDATED* auxiliary data sets
	* PD ratio
		do "do/auxiliary_files/data_construction_PD_ratio.do"
		do "do/auxiliary_files/data_construction_PD_ratio_updated.do"
	* Sentiment
		do "do/auxiliary_files/data_construction_sentiment.do"
		do "do/auxiliary_files/data_construction_sentiment_updated.do"
*/	


* THIS VERSION: includes a SUBSAMPLE of countries from the OECD database, including ONLY those for which PD is available (except for Japan, whose PD data is odd -- see below)
* NOTE: the OECD database also includes some non-OECD countries, such as Brazil, China, and Russia, among others; however, this restricted sample only includes OECD members


***************************
* Combining all data sets *
***************************

* Import R&D data from Excel to Stata
	clear all
	import excel "orig/rd.xlsx", sheet("final_series") firstrow
	destring cou, replace
	egen id = group(cou)
	xtset id year
	gen g_rd = ln(rd) - ln(l.rd)
	drop id
	save "data/rd.dta", replace
		
* Merge Penn data with PD and sentiment data
	clear all
	use "orig/penn_world_table/penn_world_table_data.dta"
	rename countrycode cou
	// merge
		// PD ratio
			merge m:m cou year using "data/pd.dta"
			drop _merge
		// local sentiment data
			merge m:m cou year using "data/local_sentiments_panel_y.dta"
			drop _merge
		// local sentiment data measured in December
			merge m:m cou year using "data/local_sentiments_panel_y_m12.dta"
			drop _merge
		// aggregate sentiment data
			merge m:m year using "data/main_sentiments_y.dta"
			drop _merge
		// US eight principal components from Ludvingson and Ng (2009)
			merge m:m year using "data/macro_forecasting_factors_updated_annual.dta"
			drop _merge
		// rd data
			merge m:m cou year using "data/rd.dta"
			drop _merge
	// set data as panel
		egen id = group(cou)
		xtset id year

* Two new countries from the update: Canada and Japan
		tabulate cou if cci != . & pd != ., summarize (pd)
	// the Japan case: its mean PD ratio is 3 sigma's higher (!!) than that of the other countries
		tabstat pd if cci != . & cou == "JPN", statistics(mean sd p50 p75 p90 p95 p99)		
		tabstat pd if cci != . & cou != "JPN", statistics(mean sd p50 p75 p90 p95 p99)		
	// the Canada case: its mean PD ratio is perfectly in line with that of the other countries
		tabstat pd if cci != . & cou == "CAN", statistics(mean sd p50 p75 p90 p95 p99)		
		tabstat pd if cci != . & cou != "CAN", statistics(mean sd p50 p75 p90 p95 p99)		
	// therefore: keep Canada, drop Japan
		drop if cou == "JPN"
	
* Generate macro variables
	// TFP
		// calculate t-step-ahead TFP growth
			foreach x in rtfpna rwtfpna ctfp cwtfp g_rd {
				gen f_`x' = f.`x'
				gen f2_`x' = f2.`x'
				gen f3_`x' = f3.`x'
				gen f4_`x' = f4.`x'
				gen f5_`x' = f5.`x'
				gen f6_`x' = f6.`x'
				gen f7_`x' = f7.`x'
				gen f8_`x' = f8.`x'				
			}
			*** NOTE: the definition of TFP growth (or "RTFP") is in Eq.(5) of "pwt80_capital_labor_and_TFP")
		// summary statistics
			tabstat rtfpna rwtfpna ctfp cwtfp g_rd, statistics(mean sd min med max)
			pwcorr rtfpna rwtfpna ctfp cwtfp g_rd, sig
	// macro variables
		// generate consumption and capital
			// real household consumption
				gen rhc = csh_c * rgdpna
			// real gross capital formation
				gen rgcf = csh_i * rgdpna
		// calculate growth rates
			foreach x in rgdpna emp rhc labsh pl_con rconna rgcf rnna rdana irr {
				gen d_`x' = d.`x'
				gen g_`x' = ln(`x') - ln(l.`x')
			}
		// calculate future values
			foreach x in g_rgcf irr d_irr g_rhc g_emp g_rgdpna {
				gen f_`x' = f.`x'
				gen f2_`x' = f2.`x'
				gen f3_`x' = f3.`x'
				gen f4_`x' = f4.`x'
				gen f5_`x' = f5.`x'
				gen f6_`x' = f6.`x'
				gen f7_`x' = f7.`x'
				gen f8_`x' = f8.`x'				
			}
	// GDP shares
		// net exports 
			gen csh_nx = csh_x - csh_m
		// changes
			foreach x in csh_c csh_i csh_nx csh_x csh_m {
			gen d_`x' = d.`x'
			gen f_d_`x' = fd.`x'			
			}		
	// rescale and express in changes
		tabstat rgdpna rconna emp, statistics(mean sd med)
		gen rgdpna_1tr = rgdpna / 1000000
		gen rconna_1tr = rconna / 1000000
		tabstat rgdpna_1tr rconna_1tr, statistics(mean sd min med max)
		foreach x in rgdpna_1tr rconna_1tr {
			gen d_`x' = d.`x'
		}
				
* Standardize macro variables
	// local
		foreach x in d_rgdpna_1tr d_emp d_rconna_1tr d_labsh d_pl_con {
			egen `x'mean = mean(`x')
			egen `x'sd = sd(`x')
			gen z`x' = (`x'-`x'mean) / `x'sd
			drop `x'sd `x'mean
		}
	// US
		foreach x in F1 F2 F3 F4 F5 F6 F7 F8 {
			egen `x'mean = mean(`x')
			egen `x'sd = sd(`x')
			gen z`x' = (`x'-`x'mean) / `x'sd
			drop `x'sd `x'mean
		}
	
* Dummy for G7 countries (USA, CAN, FRA, DEU, ITA, JPN, GBR)
	// including the U.S.
		gen dum_G7 = (cou == "USA" | cou == "CAN" | cou == "FRA" | cou == "DEU" | cou == "ITA" | cou == "JPN" | cou == "GBR")
	// excluding the U.S.
		gen dum_G7n = (cou == "CAN" | cou == "FRA" | cou == "DEU" | cou == "ITA" | cou == "JPN" | cou == "GBR")
	
* Consumer sentiment
	// country-level
		// orthogonalized
			// changes
				qui: xtreg cci d_rgdpna d_emp d_rconna d_labsh d_pl_con i.year, robust fe
				predict cci_ort, r
			// changes, measured in December
				qui: xtreg cci_m12 d_rgdpna d_emp d_rconna d_labsh d_pl_con i.year, robust fe
				predict cci_ort_m12, r
			// changes, excluding the U.S.
				qui: xtreg cci d_rgdpna d_emp d_rconna d_labsh d_pl_con i.year if cou != "USA", robust fe
				predict cci_ort1, r
		// explained
			gen cci_exp = cci - cci_ort
			gen cci_exp_m12 = cci_m12 - cci_ort_m12
			gen cci_exp1 = cci - cci_ort1 if cou != "USA"
		// standardized
			foreach x in cci cci_ort cci_exp cci_m12 cci_ort_m12 cci_exp_m12 cci_ort1 cci_exp1 {
			egen `x'mean = mean(`x') if pd != .
			egen `x'sd = sd(`x') if pd != .
			gen z`x' = (`x'-`x'mean) / `x'sd
			drop `x'sd `x'mean
			}
		// restrict sample to consumer sentiment subsample
			keep if cci_ort != .
			save "data/final_data_set.dta", replace
	// specific geographic areas
		// U.S.
			clear all
			use "data/final_data_set.dta"
			keep if cou == "USA"
			tsset year
			reg cci d_rgdpna d_emp d_rconna d_labsh d_pl_con, robust
			predict cci_USA_ort, r
			keep cci_USA_ort year
			save "data/cci_USA_ort.dta", replace
			clear all
			use "data/final_data_set.dta"
			merge m:m year using "data/cci_USA_ort.dta"
			drop _merge
			save "data/final_data_set.dta", replace
			erase "data/cci_USA_ort.dta"
		// G7 countries
			clear all
			use "data/final_data_set.dta"
			keep if dum_G7 == 1
			egen cci_G7_ort = rowmean(cci_ort)
			collapse cci_G7_ort, by(year)
			save "data/cci_G7_ort.dta", replace
			clear all
			use "data/final_data_set.dta"
			merge m:m year using "data/cci_G7_ort.dta"
			drop _merge
			save "data/final_data_set.dta", replace
			erase "data/cci_G7_ort.dta"
		// G7 countries except the U.S.
			clear all
			use "data/final_data_set.dta"
			keep if dum_G7n == 1
			egen cci_G7n_ort = rowmean(cci_ort)
			collapse cci_G7n_ort, by(year)
			save "data/cci_G7n_ort.dta", replace
			clear all
			use "data/final_data_set.dta"
			merge m:m year using "data/cci_G7n_ort.dta"
			drop _merge
			save "data/final_data_set.dta", replace
			erase "data/cci_G7n_ort.dta"
		// All OECD countries
			clear all
			use "data/final_data_set.dta"
			egen cci_OECD_ort = rowmean(cci_ort)
			collapse cci_OECD_ort, by(year)
			save "data/cci_OECD_ort.dta", replace
			clear all
			use "data/final_data_set.dta"
			merge m:m year using "data/cci_OECD_ort.dta"
			drop _merge
			save "data/final_data_set.dta", replace
			erase "data/cci_OECD_ort.dta"
		
* Generate returns
		clear all
		use "data/final_data_set.dta"
		xtset id year
	// one-period
		// 1-year-ahead
			gen f_ret= ln(f.pd)-ln(pd)
		// 2-year-ahead
			gen f2_ret= ln(f2.pd)-ln(f.pd)
		// 3-year-ahead
			gen f3_ret= ln(f3.pd)-ln(f2.pd)
		// 4-year-ahead
			gen f4_ret= ln(f4.pd)-ln(f3.pd)
		// 5-year-ahead
			gen f5_ret= ln(f5.pd)-ln(f4.pd)
	// cumulative	
		// 1 and 2 years ahead
			gen f12_ret= ln(f2.pd)-ln(pd)
		// 1 to 3 years ahead
			gen f13_ret= ln(f3.pd)-ln(pd)
		// 1 to 4 years ahead
			gen f14_ret= ln(f4.pd)-ln(pd)
		// 1 to 5 years ahead
			gen f15_ret= ln(f5.pd)-ln(pd)
		
	save "data/final_data_set.dta", replace
		

		
*********************************
* Variables from Colacito et al *
*********************************

* Standardize PD ratio
	clear all
	use "data/final_data_set.dta"
	xtset id year

	// all countries
		foreach x in pd {
		egen `x'mean = mean(`x') if pd != .
		egen `x'sd = sd(`x') if pd != .
		gen z`x' = (`x'-`x'mean) / `x'sd
		drop `x'sd `x'mean
		}
	// all countries except the U.S.
		gen pd1 = pd if cou != "USA"
		foreach x in pd1 {
		egen `x'mean = mean(`x') if pd != .
		egen `x'sd = sd(`x') if pd != .
		gen z`x' = (`x'-`x'mean) / `x'sd
		drop `x'sd `x'mean
		}

* PD decomposition: sentiment (CCI ORT) v. fundamentals
	// all countries
		xtreg pd cci_ort i.year, robust fe
		predict xb	// sentiment component
		predict res, r	// economic component
	// all countries, measured in December
		xtreg pd cci_ort_m12 i.year, robust fe
		predict xb_m12	// sentiment component
		predict res_m12, r	// economic component
	// all countries except the U.S.
		xtreg pd cci_ort1 i.year if cou != "USA", robust fe
		predict xb1	// sentiment component
		predict res1, r	// economic component
			
* Original investment model from Colacito et al.		
	// generate residuals from equation 1 (p. 3419)
		// all countries
			qui: xtreg rtfpna l.pd i.year, robust fe
				predict res_prod_orig, r
		// all countries except the U.S.
			qui: xtreg rtfpna l.pd i.year if cou != "USA", robust fe
				predict res_prod_orig1, r		
	// generate residuals from equation 2 (p. 3419)
		// all countries
			qui: xtreg pd l.pd i.year, robust fe
				predict res_econ_orig, r
		// all countries except the U.S.
			qui: xtreg pd l.pd i.year if cou != "USA", robust fe
				predict res_econ_orig1, r
	
* Updated investment model with sentiment decomposition
	// all countries
			qui: xtreg rtfpna l.res l.xb i.year, robust fe
				predict exp_prod
				predict res_prod, r
			qui: xtreg res l.res i.year, robust fe
				predict res_econ, r
			qui: xtreg xb l.xb i.year, robust fe
				predict res_sent, r
	// all countries, sentiment measured in December
			qui: xtreg rtfpna l.res_m12 l.xb_m12 i.year, robust fe
				predict exp_prod_m12
				predict res_prod_m12, r
			qui: xtreg res_m12 l.res_m12 i.year, robust fe
				predict res_econ_m12, r
			qui: xtreg xb_m12 l.xb_m12 i.year, robust fe
				predict res_sent_m12, r
	// all countries except the U.S.
			qui: xtreg rtfpna l.res l.xb i.year if cou != "USA", robust fe
				predict exp_prod1
				predict res_prod1, r
			qui: xtreg res l.res i.year if cou != "USA", robust fe
				predict res_econ1, r
			qui: xtreg xb l.xb i.year if cou != "USA", robust fe
				predict res_sent1, r

* Standardize residuals
	foreach x in ///
		xb xb1 res res1 xb_m12 res_m12 ///
		res_prod_orig res_prod_orig1 ///
		res_econ_orig res_econ_orig1 ///
		res_prod res_prod1 ///
		res_econ res_econ1 res_econ_m12 ///
		res_sent res_sent1 res_sent_m12 ///
			{
				egen `x'mean = mean(`x') if pd != . & cci_ort != .
				egen `x'sd = sd(`x') if pd != . & cci_ort != .
				gen z`x'=(`x'-`x'mean)/`x'sd
				drop `x'sd `x'mean
			}
		
* Generate variables for PD decomposition from the Appendix
	// lag variables
		foreach x in zpd zpd1 cci_ort cci_ort_m12 ///
			{
				gen l_`x' = l.`x'
			}
	// generate lagged sentiment excluding the U.S.
		gen l_cci_ort1 = l_cci_ort if cou != "USA"
	// standardize sentiment
		foreach x in l_cci_ort l_cci_ort1 l_cci_ort_m12 ///
			{
				egen `x'mean = mean(`x') if pd != . & cci_ort != .
				egen `x'sd = sd(`x') if pd != . & cci_ort != .
				gen z`x'=(`x'-`x'mean)/`x'sd
				drop `x'sd `x'mean
			}

	save "data/final_data_set.dta", replace

// avoid clutter and delete missing observations
		drop if year == .
	
	save "data/final_data_set.dta", replace
	clear all

	
	